/* Stata do file for:
Koplenig - Against statistical significance testing in corpus linguistics
last checked: 05/24/2016*/
* version control */
version 14.0

*************************************************************************************************************
/* SECTION 2 */
/*How many ways to choose 100 from 1,000,000

       ln(comb(n,k)    =  ln( (n!)/( k! (n-k)! ) )
                       =  ln(n!) - ln(k!) - ln(n-k)!
        use Stata's lnfactorial
        */


clear
set obs 101
gen d_1=_n-1
forvalues i=2/4 {
expand 101
bysort d*: gen d_`i'=_n-1
egen rowtotal=rowtotal(d*)
drop if rowtotal>100
drop rowtotal
}

keep if d_1+d_2+d_3+d_4==100
sort d*

save test, replace
use test, clear
*keep if abs(d_1/(d_1+d_3)-d_2/(d_2+d_4))>=abs(.2857-.50)

/* get all p-values for this example

           |          col
       row |         1          2 |     Total
-----------+----------------------+----------
         1 |        20         15 |        35 
         2 |        50         15 |        65 
-----------+----------------------+----------
     Total |        70         30 |       100 
 

 */

gen n_1=150000
gen n_2=150000
gen n_3=350000
gen n_4=350000
gen N=1000000
gen K=100
gen double p=exp(lnfactorial(n_1)+lnfactorial(n_2)+lnfactorial(n_3)+lnfactorial(n_4)-(lnfactorial(d_1)+lnfactorial(d_2)+lnfactorial(d_3)+lnfactorial(d_4)+lnfactorial(n_1-d_1)+lnfactorial(n_2-d_2)+lnfactorial(n_3-d_3)+lnfactorial(n_4-d_4))-lnfactorial(N)+lnfactorial(K)+lnfactorial(N-K))
*replace p=exp(p)
sum p
di %6.5f r(sum)



/* make results reproducible */

set seed 322937

/* generate hypothetical samples */

tabi 20 15 \ 50 15, exact col chi2


clear
set obs 1000000000

gen sa_four=rhypergeometric(1000000,500000,100)
gen sa_five=100-sa_four
gen blue_four=rhypergeometric(500000,150000,sa_four) 
gen blue_five=rhypergeometric(500000,150000,sa_five) 
gen red_four=sa_four-blue_four   
gen red_five=sa_five-blue_five

gen match=1
collapse (sum) match, by(b* r*) fast

gen p_diff=blue_four/(blue_four+red_four)-blue_five/(blue_five+red_five)
drop if p==.

gen abs=abs(p_diff)
sum abs
list if abs==r(max) 


mylabels -40 -20 0 20 40, myscale(@/100) local(labels1) 
histogram p_diff [fweight=match], bin(51) frequency normal  ///
fcolor(orange) lcolor(white) normopts(lcolor(blue) lwidth(thick)) ///
ylabel(#6, angle(h) nogrid format(%12.0gc)) ///
xtitle("Percentage difference found in sample")  /// 
ytitle("Frequency of occurence") ///
xlabel(`labels1', angle(h)nogrid) title("" ) scheme(s2mono) graphregion(color(white)) ///
xscale(nofextend) yscale(nofextend) 
	graph export figure1.png, replace wid(1600) heig(1130)


sum match if abs(p_diff)<.10
local nom=r(sum)
sum match
local prob=`nom'/r(sum)
di `prob'

sum match if abs(p_diff)>=abs(.2857-.50)
di r(sum)
local nom=r(sum)
sum match
local prob=`nom'/r(sum)
di `prob'

/* simulate how many samples of 1,000,000 yield at least Table 1 percentage diff */

/* A: sample size 10,000 */
clear
set obs 1000000000
/* make results reproducible */

set seed 310516

gen sa_four=rhypergeometric(1000000,500000,10000)
gen sa_five=10000-sa_four
gen blue_four=rhypergeometric(500000,150000,sa_four) 
gen blue_five=rhypergeometric(500000,150000,sa_five) 
gen red_four=sa_four-blue_four   
gen red_five=sa_five-blue_five


keep if abs(blue_four/(blue_four+red_four)-blue_five/(blue_five+red_five))>=abs(.2857-.50)
di _N/1000000000

/* B: sample size 4 */
clear
set obs 1000000000
/* make results reproducible */

set seed 310516

gen sa_four=rhypergeometric(1000000,500000,4)
gen sa_five=4-sa_four
gen blue_four=rhypergeometric(500000,150000,sa_four) 
gen blue_five=rhypergeometric(500000,150000,sa_five) 
gen red_four=sa_four-blue_four   
gen red_five=sa_five-blue_five


keep if abs(blue_four/(blue_four+red_four)-blue_five/(blue_five+red_five))>=1
di _N/1000000000

exit
*************************
/* Section 3.2 */  
*************************
clear  
set obs 1 
gen N=.
gen p1=.  
gen p2=.  
 
 
forvalues p1=0(.005)1 {  
		forvalues p2=0(.005)1 { 
				sampsi `p1' `p2', alpha(.01) 
				local new = _N + 1  
  set obs `new' 
  replace p1=`p1' in `new'  
  replace p2=`p2' in `new'  
  replace N=2*r(N_1) in `new'  
  } 
  } 
drop if N==. 
save corpussig, replace  
 
use corpussig, clear  
gen difference=abs(p1-p2)
bysort N diff: gen id=_n 
keep if id==1
gen logN=log10(N)  
mylabels 10 100 1000 10000 100000 500000, myscale(log10(@)) local(labels1)  
mylabels 1 10 20 30 40 50 60 70 80 90 100, myscale(@/100) local(labels2) 
scatter logN di ///
, msize(tiny) msymbol(Oh) mcolor(orange) ///  
xlabel(`labels2', angle(h)nogrid) xtitle(Percentage difference) ///
ylabel(`labels1', angle(h) nogrid) ytitle(Sample size)  ///  
title("" ) scheme(s2mono) graphregion(color(white)) ///
xscale(nofextend) yscale(nofextend)  
	graph export figure2.png, replace wid(1600) heig(1130)  

**************************
*/* Section 3.4 */  
**************************

sampsi .5 .25, alpha(.000001)

di r(N_1)*2

sampsi .5 .55, alpha(.98)

di r(N_1)*2

clear
set obs 290
generate X=1
generate Y=1
gen media="written"
local p1=290*.5
replace Y=0 if _n<=290
replace X=0 if _n<`p1'
save p1, replace

clear
set obs 290
generate X=1
generate Y=1
gen media="written"
local p2=290*.25
replace X=0 if _n<`p2'
append using p1
erase p1.dta
save written, replace

clear
set obs 379
generate X=1
generate Y=1
gen media="spoken"
local p1=379*.5
replace Y=0 if _n<=379
replace X=0 if _n<`p1'
save p1, replace

clear
set obs 379
generate X=1
generate Y=1
gen media="spoken"
local p2=379*.55
replace X=0 if _n<`p2'
append using p1
erase p1.dta

append using written
erase written.dta
save balancing, replace


preserve
set seed 19032104
sample 270 if media=="written", count
sample 30 if media=="spoken", count
tab X Y, col chi2 nofreq
restore

clear
set obs 300
gen fractionspoken=_n
gen pwert=.
save pwerte, replace

forvalues i=0(1)300 {
								local spoken=`i'
								local written=300-`i'
								use balancing, clear
								sample `written' if media=="written", count
								sample `spoken' if media=="spoken", count
								qui tab X Y, col chi2 nofreq
								local pwert=r(p)
								use pwerte, clear
								replace pwert=`pwert' if fractionspoken==`i'
								save pwerte, replace
								}

use pwerte, clear

 sum pwert
 local denom=r(N)
 sum pwert if pwert<=0.05
 di 1-r(N)/`denom'
 
scatter pwert fractionspoken ///
, msize(medium) msymbol(Oh) mcolor(orange) yline(0.05, lcolor(blue) lwidth(thick)) ///
xlabel(, angle(h)nogrid) ytitle(p-value) ///
ylabel(0 .05 .2 .4 .6 .8 1, angle(h) nogrid) xtitle("Number of elements from the spoken part of the library") scheme(s2mono)  ///
title("" ) graphregion(color(white)) yscale(nofextend) xscale(nofextend)  
	graph export figure3.png, replace wid(1600) heig(1130)

************************
/* Section 4 */        
*************************
set seed 230516
clear
matrix C = (1, .7 \ .7, 1)

corr2data x y, n(8) corr(C) means(10 10)
sum y
local yli=r(mean)
gen yli=r(mean)
reg y x
predict yhat

sum x
local min=r(min)

gen x2=x+.01

gen id=_n
generate tss=(y-yli)^2
generate ess=(yhat-yli)^2
generate rss=(yhat-y)^2

reg y x
sum tss
local e1=r(sum)
sum rss
local e2=r(sum)

di (`e1'-`e2')/`e1'   

di `e1'
di `e2' 

 preserve
clear
local new=100*int(sqrt(`e1'))
set obs `new'
gen x=_n
gen y=`new'
local new2=100*int(sqrt(`e2'))
gen y2=`new2'

tw ///
(area y x, 							 lwidth(medthick) fcolor(blue) 	fi(inten20)	lcolor(blue)) ///
(area y2 x if x<=`new2', lwidth(medthick) fcolor(orange)fi(inten20) 	lcolor(orange)), ///
 graphregion(color(white)) scheme(s2mono) xlabel(0 `new', nogrid) ylabel(0 `new', nogrid) ///
  title("D")  legend(off) ylabel(, nogrid)  ysize(1) xsize(1) yscale(off) xscale(off) ///
 xlabel(, nogrid) ytitle("y") saving(D.gph, replace) text(50 50 "ESS", color(orange) size(large)) text(130 100 "TSS", color(blue) size(large)) nodraw
 
 restore



tw (scatter y x, mcolor(black) msymbol(O)) ///
, graphregion(color(white)) scheme(s2mono) ///
  title("A")  legend(off) ylabel(, nogrid)   ///
 xlabel(, nogrid) ytitle("y") saving(A.gph, replace) nodraw
 



tw (rcapsym y yli x, msymbol(i) lwidth(medthick) lcolor(blue)) ///
		(scatter y x, mcolor(black) msymbol(O)) ///
, graphregion(color(white)) scheme(s2mono) ///
  title("B")  legend(off) ylabel(`yli', nogrid) yline(`yli', lpattern(dot)) ///
 xlabel(none, nogrid) ytitle("y") saving(B.gph, replace) nodraw
 
tw (rcapsym y yhat x, msymbol(i) lwidth(medthick) lcolor(orange)) ///
		(scatter y x, mcolor(black) msymbol(O)) ///
(line yhat x, lcolor(black) lpattern(solid)), /// 
graphregion(color(white)) scheme(s2mono) ///
   title("C") legend(off) ylabel(none, nogrid)  ///
 xlabel(none, nogrid) ytitle("y") saving(C.gph, replace) nodraw


graph combine A.gph B.gph C.gph D.gph, cols(2) graphregion(color(white)) scheme(s2mono) ysize(1) xsize(1) 


graph export figure4.png, replace height(2000)

		



exit

*
*contact: koplenig@ids-mannheim.de		
